library(tidyverse)
library(lubridate)
library(caret)
library(nnet)
library(plotly)
library(randomForest)

1. Data wrangling

First, we will import the raw data, clean them, and merge them according to our analysis needs. Then, we will take the monthly average to incorporate other data.

1.1 Moore reef data from AIMS

# function to clean and wrangle AIMS Moore Reef data to deal with column names and column placement
clean_MooreReef_data <- function(df) {
    df[,2:ncol(df)] <- df[,1:ncol(df)]
    df[,1] <- rownames(df)
    df <- df %>% select(date, colnames(df)[ncol(df)])
    return(df)
}
moore_reef_water_temp_2.0m <- read.csv("raw_data/temp/AIMS_MooreReef_WaterTemperature_22Oct1997to16Feb2020_2.0m.csv", skip=108, sep= ",", row.names = NULL)
moore_reef_water_temp_9.0m <- read.csv("raw_data/temp/AIMS_MooreReef_WaterTemperature_22Oct1997to17Dec2017_9.0m.csv", skip=94, sep= ",", row.names = NULL)


# run through function defined above to clean and wrangle data
moore_reef_water_temp_2.0m <- clean_MooreReef_data(moore_reef_water_temp_2.0m)
## Warning in `[<-.data.frame`(`*tmp*`, , 2:ncol(df), value = structure(list(:
## provided 6 variables to replace 5 variables
moore_reef_water_temp_9.0m <- clean_MooreReef_data(moore_reef_water_temp_9.0m)
## Warning in `[<-.data.frame`(`*tmp*`, , 2:ncol(df), value = structure(list(:
## provided 6 variables to replace 5 variables
moore_reef_water_temp_2.0m <- moore_reef_water_temp_2.0m %>%
    filter(Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG != "Not available")

moore_reef_water_temp_9.0m <- moore_reef_water_temp_9.0m %>%
    filter(Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG != "Not available")
# merge AIMS Moore reef temperature data
aims_temp_data <- Reduce(function(x,y) merge(x,y, by="date"), list(moore_reef_water_temp_2.0m,
                                                              moore_reef_water_temp_9.0m))
                                                            
# convert water temp data from string to numeric type
aims_temp_data$Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG <- 
    as.numeric(as.character(aims_temp_data$Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG))

aims_temp_data$Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG <- 
    as.numeric(as.character(aims_temp_data$Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG))

# convert to date column into date type
aims_temp_data$date <- as.Date(aims_temp_data$date)

colnames(aims_temp_data) <- c("date", "avg_water_temp_2.0m_flat_site", "avg_water_temp_9.0m_slope_site")


# write combined AIMS Moore reef data
write.csv(aims_temp_data, "cleaned_data/aims_temperatures.csv", row.names = FALSE)

1.2 Coral cover data

# convert date-decimal in coral cover to YYYY-MM-DD format
coral_cover <- read.csv("raw_data/trendgbr-coral-cover-with-ci.csv")
coral_cover$Date <- as.Date(format(date_decimal(coral_cover$Date), "%Y-%m-%d"))

# rename "Date" to "date"
names(coral_cover)[names(coral_cover) == "Date"] <- "date"
colnames(coral_cover) <- c("date", "mean_live_coral_cover_percent", "lower_conf_int", "upper_conf_int", "conf_int_span")
write.csv(coral_cover, "cleaned_data/coral_cover.csv", row.names = FALSE)

1.3 Fish census data

fish_census <- read.csv("raw_data/Fish census 1992-2015.csv", sep="\t", header=T)

fish_census <- fish_census %>% select(gbifID, class, family, genus, species, verbatimScientificName, decimalLatitude, decimalLongitude, dateIdentified)
fish_census$dateIdentified <- ymd_hms(fish_census$dateIdentified)

Group fish census by date, then count how many unique fish species were observed on a given date

fish_species_counts <- fish_census %>%
    arrange(dateIdentified) %>%
    group_by(dateIdentified) %>%
    summarise(num_of_species=n_distinct(species))
## `summarise()` ungrouping output (override with `.groups` argument)
# rename "dateIdentified" to "date"
names(fish_species_counts)[names(fish_species_counts) == "dateIdentified"] <- "date"

fish_species_counts$date <- as.Date(fish_species_counts$date)

write.csv(fish_species_counts, "cleaned_data/fish_species_counts.csv", row.names = FALSE)

1.4 Sea grass data

We would like to see whether we can classify sea grass species based on the survey location (latitude, longitude, and depth below sea level), and the type of sediment and seabed in which they were discovered.

We decided not to use sea grass species belonging to the Halophila genus because they are so widespread and common in tropical waters. Since the halophila genus were also present in nearly all the survey sites where other sea grass species were found, we determined that it did not make much sense from a scientific perspective, as well as from the data set we were given.

seagrass_data <- read.csv("raw_data/GBR_NESP-TWQ-3.2.1-5.4_JCU_Seagrass_1984-2018_Site-surveys.csv") %>%
    # filter out for rows where we have information
    filter(SEDIMENT != "Not recorded" & PRESENCE_A == "Present") %>%

    # delete columns we are not going to use
    select(-FID, -MONTH, -YEAR, 
           -SURVEY_MET, -SURVEY_NAM, 
           # remove seagrass belonging to halophila genus
           -H_CAPRICOR, -H_TRICOSTA, -H_OVALIS, -H_UNINERVI, -H_DECIPIEN, -H_SPINULOS) 

Let’s examine how many presence/absence we see for each sea grass species:

table(seagrass_data$C_ROTUNDAT)
## 
##    No   Yes 
## 30366   187
table(seagrass_data$C_SERRULAT)
## 
##    No   Yes 
## 28899  1654
table(seagrass_data$E_ACOROIDE)
## 
##    No   Yes 
## 30494    59
table(seagrass_data$S_ISOETIFO)
## 
##    No   Yes 
## 30353   200
table(seagrass_data$T_CILIATUM)
## 
##    No 
## 30553
table(seagrass_data$T_HEMPRICH)
## 
##    No   Yes 
## 29674   879
table(seagrass_data$Z_CAPRICOR)
## 
##    No   Yes 
## 20074 10479

We see that there is actually no data at all for presence of T_CILIATUM. In addition, there are only 59 observations for E_ACOROIDE and 187 observations for C_ROTUNDAT that are actually useful in classifying these two species. Because we have a rather large data set, we want 200 or more observations for our classification model.So, we remove these columns and from our classification problem due to insufficient data.

seagrass_data <- seagrass_data %>% select(-C_ROTUNDAT, -T_CILIATUM, -E_ACOROIDE)

Since we deleted some columns, there may be some observations where all the remaining species columns have “No” values for absence. So, we filter out for rows where presence of at least one species was observed.

seagrass_data <- seagrass_data %>%
    filter(C_SERRULAT=="Yes" | 
           S_ISOETIFO=="Yes" |
           T_HEMPRICH=="Yes" |
           Z_CAPRICOR=="Yes")

Now we want to count how many species were found at each location site.

count_species_present <- function(C_SERRULAT, S_ISOETIFO, T_HEMPRICH, Z_CAPRICOR) {
    count = 0
    
    if (C_SERRULAT=="Yes") {
        count <- count + 1
    }
    if (S_ISOETIFO=="Yes") {
        count <- count + 1
    }
    if (T_HEMPRICH=="Yes") {
        count <- count + 1
    }
    if (Z_CAPRICOR=="Yes") {
        count <- count + 1
    }
    
    return(count)
}

seagrass_data$num_species_present <- mapply(count_species_present, 
                                            seagrass_data$C_SERRULAT,
                                            seagrass_data$S_ISOETIFO,
                                            seagrass_data$T_HEMPRICH,
                                            seagrass_data$Z_CAPRICOR)

How many survey sites had more than 1 species discovered?

table(seagrass_data$num_species_present)
## 
##     1     2     3 
## 12381   411     3
nrow(seagrass_data)
## [1] 12795

We see that only 3.2% of total survey sites recorded more than 1 species observed. For ease of building a classification model, we will remove these rows. We do not want a situation where our model cannot “decide” in classifying our observations. Because we removed only about 3% of over 12,500 observations, we still preserve statistical power.

seagrass_data <- seagrass_data %>% filter(num_species_present < 2)
table(seagrass_data$C_SERRULAT)
## 
##    No   Yes 
## 11125  1256
table(seagrass_data$S_ISOETIFO)
## 
##    No   Yes 
## 12280   101
table(seagrass_data$T_HEMPRICH)
## 
##    No   Yes 
## 11558   823
table(seagrass_data$Z_CAPRICOR)
## 
##    No   Yes 
##  2180 10201

So, after some data wrangling and exploratory analysis, we will build a model to classify 4 species of seagrass: Cymodocea Serrulata , Syringodium Isoetifolium, Thalassia Hemprichii, and Zostera Muelleri (subspecies Capricorni). Now, let’s build a SPECIES column that collects all the presence in a single variable.

# function to make a species column
get_species_type <- function(C_SERRULAT, S_ISOETIFO, T_HEMPRICH, Z_CAPRICOR) {
    if (C_SERRULAT=="Yes") {
        return("C_SERRULAT")
    } else if (S_ISOETIFO=="Yes") {
        return("S_ISOETIFO")
    } else if (T_HEMPRICH =="Yes") {
        return("T_HEMPRICH")
    } else if (Z_CAPRICOR =="Yes") {
        return("Z_CAPIRCOR")
    }
}

# build species column to classify species of each observation based on presence/absence
seagrass_data$SPECIES <- mapply(get_species_type, 
                                seagrass_data$C_SERRULAT, 
                                seagrass_data$S_ISOETIFO,
                                seagrass_data$T_HEMPRICH,
                                seagrass_data$Z_CAPRICOR)

table(seagrass_data$SPECIES)
## 
## C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR 
##       1256        101        823      10201

Data frame cleanup

# convert SPECIES to unordered factor
seagrass_data$SPECIES <- factor(seagrass_data$SPECIES, ordered=FALSE)

# rename misspelled column in original data set
names(seagrass_data)[names(seagrass_data) == "LATITUTDE"] <- "LATITUDE"

# only select columns relevant for our ML algorithm
seagrass_data <- seagrass_data %>% 
    select(SPECIES, LATITUDE, LONGITUDE, DEPTH, SEDIMENT, TIDAL)

Write to .csv file.

write.csv(seagrass_data, "cleaned_data/seagrass_classification_data.csv", row.names = FALSE)




2. Regression and machine learning

2.1 Effect of sea water temperature on fish in the Great Barrier Reef

2.1.1 Exploratory data analysis and visualization

In this analysis, we will examine the effects of water temperature on the number of unique fish species observed in the Great Barrier Reef from 1997 to 2011.

Originally, we wanted to examine a relationship between water temperature and coral cover. However, we quickly saw that, when visualized, there does not seem to be a linear relation and we did not want to fit a mis-specified model.

Merge between temperature and coral cover data sets

temp <- read_csv("cleaned_data/aims_temperatures.csv")
## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   avg_water_temp_2.0m_flat_site = col_double(),
##   avg_water_temp_9.0m_slope_site = col_double()
## )
coral_cover <- read_csv("cleaned_data/coral_cover.csv")
## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   mean_live_coral_cover_percent = col_double(),
##   lower_conf_int = col_double(),
##   upper_conf_int = col_double(),
##   conf_int_span = col_double()
## )
temp_coral_cover <- merge(x=temp, y=coral_cover, by="date")
temp_coral_cover %>%
    ggplot(aes(avg_water_temp_2.0m_flat_site, mean_live_coral_cover_percent)) + 
    geom_point()

temp_coral_cover %>%
    ggplot(aes(avg_water_temp_9.0m_slope_site, mean_live_coral_cover_percent)) + 
    geom_point()

As we can see, there is no linear relationship. While there are 2 or 3 clusters of coral cover percentage values, they seem to be pretty consistent across the range of water temperatures at 2.0m and 9.0 below sea level.

We also suspect that increase in water temperature could also affect diversity in sea life. So, we will examine the relationship between water temperature and the number of unique species of fish observed in the Great Barrier Ree. First, we can do some basic visualization by plotting the relationship between water temperatures at depth 2.0m and 9.0m with the number of unique fish species observedin the Great Barrier Reef.

fish <- read_csv("cleaned_data/fish_species_counts.csv")
## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   num_of_species = col_double()
## )
fish_temp <- merge(x=temp, y=fish, by="date")

# water temp at 2.0m
fish_temp %>% ggplot(aes(avg_water_temp_2.0m_flat_site, num_of_species)) + geom_point()

# water temp at92.0m
fish_temp %>% ggplot(aes(avg_water_temp_9.0m_slope_site, num_of_species)) + geom_point()

There seems to be a bit of negative linear relationship going on, so we will fit a linear model examining the number of unique species discovered in relation to rising temperature.

2.1.2 Linear regression

Now, we can split our data set into train and test sets, using 0.6 to partition our data. Our outcome is the mean coral cover percentage and water temperature is our covariate. We will fit 2 linear regression models: one examining effect of water temperature at 2.0m and the other examining the effect of temperature at 9.0m.

train_index <- createDataPartition(y=fish_temp$num_of_species, times=1, p = 0.6, list=FALSE)

train_set <- fish_temp[train_index, ]
test_set <- fish_temp[-train_index, ]

Fit linear regression model:

fish_temp_2.0m <- lm(num_of_species ~ avg_water_temp_2.0m_flat_site, data=train_set)
summary(fish_temp_2.0m)
## 
## Call:
## lm(formula = num_of_species ~ avg_water_temp_2.0m_flat_site, 
##     data = train_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.879  -8.923   2.119  10.254  39.055 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   117.7762    17.7872   6.621 3.12e-10 ***
## avg_water_temp_2.0m_flat_site  -2.0309     0.6461  -3.143  0.00192 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.34 on 203 degrees of freedom
## Multiple R-squared:  0.04641,    Adjusted R-squared:  0.04172 
## F-statistic: 9.881 on 1 and 203 DF,  p-value: 0.00192
fish_temp_9.0m <- lm(num_of_species ~ avg_water_temp_9.0m_slope_site, data=train_set)
summary(fish_temp_9.0m)
## 
## Call:
## lm(formula = num_of_species ~ avg_water_temp_9.0m_slope_site, 
##     data = train_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.966  -8.802   1.979  10.327  39.395 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    119.8436    18.0473   6.641  2.8e-10 ***
## avg_water_temp_9.0m_slope_site  -2.1144     0.6582  -3.213  0.00153 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.33 on 203 degrees of freedom
## Multiple R-squared:  0.04838,    Adjusted R-squared:  0.0437 
## F-statistic: 10.32 on 1 and 203 DF,  p-value: 0.00153

We see that the models are very similar in results. The coefficient with covariate 2.0m water temperature and 9.0 water temperature is -2.56 and -2.61, respectively.

Although we should not expect a major difference between how each of the two models performs, let’s compare them anyways to assess which water temperature depth is a better predictor of unique fish species observed in the Great Barrier Reef.

pred_2.0m <- predict(fish_temp_2.0m, test_set)
pred_9.0m <- predict(fish_temp_9.0m, test_set)

postResample(pred = pred_2.0m, obs = test_set$num_of_species)
##       RMSE   Rsquared        MAE 
## 14.9712171  0.1295988 11.3846304
postResample(pred = pred_9.0m, obs = test_set$num_of_species)
##       RMSE   Rsquared        MAE 
## 14.9392278  0.1351032 11.3419986

As expected, results are very similar.

We can assess this visually to confirm our results.

# water temp at 2.0m
test_set %>% 
    ggplot(aes(avg_water_temp_2.0m_flat_site, num_of_species)) + 
    geom_point() +
    geom_abline(intercept=fish_temp_2.0m$coefficients[1], slope=fish_temp_2.0m$coefficients[2], col="red")

# water temp at 9.0m
test_set %>% 
    ggplot(aes(avg_water_temp_9.0m_slope_site, num_of_species)) + 
    geom_point() +
    geom_abline(intercept=fish_temp_9.0m$coefficients[1], slope=fish_temp_9.0m$coefficients[2], col="blue")

They both perform very similarly, and choosing either water temperature as our predictor will yield similar results.



2.2 Classification of sea grass species in the Great Barrier Reef from 1999 - 2003

Continuing on from sea life diversity, we have data on presence or absence of certain seagrass species in the Great Barrier Reef from 1999 to 2003. Let’s try to build a classifier to determine how location, and types of sediment and seabed may predict presence of certain sea grass.

As written in the data wrangling and cleaning RMarkdown/HTML file, we chose 4 species to classify: Cymodocea Serrulata, Syringodium Isoetifolium, Thalassia Hemprichii, and Zostera Muelleri (subspecies Capricorni).

2.2.1 Exploratory data analysis and visualization

seagrass <- read.csv("cleaned_data/seagrass_classification_data.csv", as.is =TRUE)

seagrass$SPECIES <- as.factor(seagrass$SPECIES)
seagrass$SEDIMENT <- as.factor(seagrass$SEDIMENT)
seagrass$TIDAL <- as.factor(seagrass$TIDAL)

head(seagrass)
##      SPECIES  LATITUDE LONGITUDE DEPTH SEDIMENT      TIDAL
## 1 Z_CAPIRCOR -23.65857  151.1206     0      Mud intertidal
## 2 Z_CAPIRCOR -23.65840  151.1212     0      Mud intertidal
## 3 Z_CAPIRCOR -23.65898  151.1203     0      Mud intertidal
## 4 Z_CAPIRCOR -23.65970  151.1197     0      Mud intertidal
## 5 Z_CAPIRCOR -23.65884  151.1205     0      Mud intertidal
## 6 Z_CAPIRCOR -23.65993  151.1197     0      Mud intertidal
summary(seagrass)
##        SPECIES         LATITUDE        LONGITUDE         DEPTH        
##  C_SERRULAT: 1256   Min.   :-24.20   Min.   :142.5   Min.   : 0.0000  
##  S_ISOETIFO:  101   1st Qu.:-23.79   1st Qu.:146.8   1st Qu.: 0.0000  
##  T_HEMPRICH:  823   Median :-22.41   Median :150.5   Median : 0.0000  
##  Z_CAPIRCOR:10201   Mean   :-21.06   Mean   :149.0   Mean   : 0.9253  
##                     3rd Qu.:-19.17   3rd Qu.:151.3   3rd Qu.: 1.2279  
##                     Max.   :-10.75   Max.   :151.9   Max.   :29.3583  
##                                                                       
##    SEDIMENT           TIDAL     
##  Gravel:   1   intertidal:7433  
##  Mud   :8969   subtidal  :4948  
##  Reef  :  63                    
##  Rock  :  66                    
##  Rubble: 107                    
##  Sand  :3151                    
##  Shell :  24

First we plot sea grass according to location (latitude and longitude). Then we will add a third axis (depth) to visualize this in 3-dimensions using plotly package. Since depth is measured in meters below sea level, we visualize this in negative values.

seagrass %>%  ggplot() + geom_point(aes(x=LATITUDE, y=LONGITUDE, color=SPECIES))

plot_ly(seagrass, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

We can also see how our categorical predictors relate to sea grass species.

seagrass %>% 
    ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")

seagrass %>% 
    ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")

2.2.2 Random forest

We will first use random forest to build a classifier and then use a multinomial logistic regression model, and compare the two.

Let’s first partition our data set into train and test sets. Since we have a lot more data here than in the linear regression model, we will partition it by 75%-25%.

# test-train split
seagrass_train_ind <- createDataPartition(y = seagrass$SPECIES, p=0.75, list=FALSE)

train_set <- seagrass[seagrass_train_ind, ]
test_set <- seagrass[-seagrass_train_ind, ]
rf_fit <- randomForest(SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT + TIDAL, 
                       data=train_set,
                       mtry = 2)
rf_fit
## 
## Call:
##  randomForest(formula = SPECIES ~ LATITUDE + LONGITUDE + DEPTH +      SEDIMENT + TIDAL, data = train_set, mtry = 2) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.71%
## Confusion matrix:
##            C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR class.error
## C_SERRULAT        705         22         10        205  0.25159236
## S_ISOETIFO         31         31          4         10  0.59210526
## T_HEMPRICH         43          2        561         12  0.09223301
## Z_CAPIRCOR         93          2          3       7553  0.01280878
rf_pred <- predict(rf_fit, newdata = test_set, type = "response")
confusionMatrix(table(pred = rf_pred, true = test_set$SPECIES))
## Confusion Matrix and Statistics
## 
##             true
## pred         C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
##   C_SERRULAT        243          9         16         24
##   S_ISOETIFO          4          8          1          1
##   T_HEMPRICH          3          3        184          2
##   Z_CAPIRCOR         64          5          4       2523
## 
## Overall Statistics
##                                          
##                Accuracy : 0.956          
##                  95% CI : (0.9482, 0.963)
##     No Information Rate : 0.8242         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8509         
##                                          
##  Mcnemar's Test P-Value : 9.047e-06      
## 
## Statistics by Class:
## 
##                      Class: C_SERRULAT Class: S_ISOETIFO Class: T_HEMPRICH
## Sensitivity                    0.77389          0.320000           0.89756
## Specificity                    0.98237          0.998045           0.99723
## Pos Pred Value                 0.83219          0.571429           0.95833
## Neg Pred Value                 0.97466          0.994481           0.99276
## Prevalence                     0.10149          0.008080           0.06626
## Detection Rate                 0.07854          0.002586           0.05947
## Detection Prevalence           0.09438          0.004525           0.06206
## Balanced Accuracy              0.87813          0.659022           0.94740
##                      Class: Z_CAPIRCOR
## Sensitivity                     0.9894
## Specificity                     0.8658
## Pos Pred Value                  0.9719
## Neg Pred Value                  0.9458
## Prevalence                      0.8242
## Detection Rate                  0.8154
## Detection Prevalence            0.8390
## Balanced Accuracy               0.9276

We see that our classification model works quite well, especially for T_HEMPRICH and Z_CAPRICOR, which have 85%+ sensitivity and specificity. However, we got quite a low sensitivity for S_ISOETIFO. Recall to our data wrangling portion that S_ISOETIFO had only about 100 “Yes” observations. Since we had a small sample size for S_ISOETIFO relative to the other 3 seagrass species, this may have contributed to the low sensitivity.

We can visually assess our predicted values with true values of species to see how our model performed.

# true values
plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers")
# predicted values
plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~rf_pred, type="scatter3d", mode="markers")

For sediment type:

test_set %>% 
    ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")

test_set %>% 
    ggplot(aes(SEDIMENT, fill=rf_pred)) + geom_bar(width=.5, position = "dodge")

For seabed type:

test_set %>% 
    ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")

test_set %>% 
    ggplot(aes(TIDAL, fill=rf_pred)) + geom_bar(width=.5, position = "dodge")

Let’s examine variable importance.

variable_importance <- importance(rf_fit)
tmp <- data_frame(feature = rownames(variable_importance),
                  Gini = variable_importance[,1]) %>%
                  arrange(desc(Gini))
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
tmp
## # A tibble: 5 x 2
##   feature    Gini
##   <chr>     <dbl>
## 1 LATITUDE  931. 
## 2 LONGITUDE 878. 
## 3 DEPTH     616. 
## 4 TIDAL     115. 
## 5 SEDIMENT   86.0

We see that longitude and latitude were very predictive of presence of seagrass followed by depth from sea level. The types of sediment and seabed (intertidal or subtidal seabed) are not very good predictors. Thus, it seems that the location of where the sea grass was discovered matters more than the various ocean floor properties.

tmp %>% ggplot(aes(x=reorder(feature, Gini), y=Gini)) + 
  geom_bar(stat='identity') +
  coord_flip() + xlab("Feature") +
  theme(axis.text=element_text(size=8))

2.2.3 Multinomial logistic regression

We can now try a multinomial logistic regression model to see how it compares to random forest. We will use the nnet package.

The logistic regression model is as follows:

multinom_fit <- multinom(SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT, data=train_set)
## # weights:  44 (30 variable)
## initial  value 12874.515732 
## iter  10 value 3313.265166
## iter  20 value 2723.852573
## iter  30 value 2565.896265
## iter  40 value 2503.556100
## iter  50 value 2474.035234
## iter  60 value 2451.155274
## iter  70 value 2451.106035
## iter  80 value 2451.001366
## iter  90 value 2449.174920
## iter 100 value 2448.479063
## final  value 2448.479063 
## stopped after 100 iterations
summary(multinom_fit)
## Call:
## multinom(formula = SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT, 
##     data = train_set)
## 
## Coefficients:
##            (Intercept)  LATITUDE LONGITUDE        DEPTH SEDIMENTMud
## S_ISOETIFO   -385.0541 2.1687827  2.778821  0.001477874   13.546418
## T_HEMPRICH   -160.9830 1.8802818  1.264179 -0.311850477    6.952428
## Z_CAPIRCOR   -301.5464 0.7094671  1.489861 -0.707840969   98.630442
##            SEDIMENTReef SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO   -56.575700    14.380473      -29.68991     14.66912      15.43732
## T_HEMPRICH     9.809814     9.855825       11.58179     10.12414      10.17451
## Z_CAPIRCOR    20.151148    95.419866       94.53164     97.80779      97.59584
## 
## Std. Errors:
##            (Intercept)   LATITUDE   LONGITUDE      DEPTH SEDIMENTMud
## S_ISOETIFO 0.002841091 0.05770350 0.007860097 0.02442800   0.4042663
## T_HEMPRICH 0.001276092 0.05726438 0.007293784 0.04166785   0.3294196
## Z_CAPIRCOR 0.002092067 0.02798120 0.004569543 0.02699023   0.3573032
##            SEDIMENTReef SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO 5.515254e-14    0.6795060   4.212678e-15    0.3748700     0.8839106
## T_HEMPRICH 5.580735e-01    0.6413926   4.695396e-01    0.3014330     1.1506238
## Z_CAPIRCOR 1.662192e-33    0.6603028   1.026883e+00    0.3586384     0.8662218
## 
## Residual Deviance: 4896.958 
## AIC: 4956.958

Relative risk ratios where reference group is C_SERRULAT

exp(coef(multinom_fit))
##              (Intercept) LATITUDE LONGITUDE     DEPTH  SEDIMENTMud SEDIMENTReef
## S_ISOETIFO 5.931185e-168 8.747629 16.100021 1.0014790 7.640729e+05 2.688350e-25
## T_HEMPRICH  1.218886e-70 6.555352  3.540184 0.7320910 1.045685e+03 1.821160e+04
## Z_CAPIRCOR 1.096588e-131 2.032908  4.436480 0.4927068 6.833715e+42 5.643291e+08
##            SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO 1.759381e+06   1.275954e-13 2.348111e+06  5.062256e+06
## T_HEMPRICH 1.906910e+04   1.071295e+05 2.493792e+04  2.622622e+04
## Z_CAPIRCOR 2.756268e+41   1.133883e+41 3.001819e+42  2.428484e+42
# predicted probabilities
predicted_prob <- predict(multinom_fit, newdata=test_set, type="probs")

# predicted classes
predicted_class <- predict(multinom_fit, newdata=test_set, type="class")

confusionMatrix(data = predicted_class, reference = test_set$SPECIES )
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
##   C_SERRULAT        125          9          6         15
##   S_ISOETIFO          0          0          0          0
##   T_HEMPRICH         16          2        177         20
##   Z_CAPIRCOR        173         14         22       2515
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9105          
##                  95% CI : (0.8999, 0.9203)
##     No Information Rate : 0.8242          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6618          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: C_SERRULAT Class: S_ISOETIFO Class: T_HEMPRICH
## Sensitivity                     0.3981           0.00000           0.86341
## Specificity                     0.9892           1.00000           0.98685
## Pos Pred Value                  0.8065               NaN           0.82326
## Neg Pred Value                  0.9357           0.99192           0.99027
## Prevalence                      0.1015           0.00808           0.06626
## Detection Rate                  0.0404           0.00000           0.05721
## Detection Prevalence            0.0501           0.00000           0.06949
## Balanced Accuracy               0.6936           0.50000           0.92513
##                      Class: Z_CAPIRCOR
## Sensitivity                     0.9863
## Specificity                     0.6158
## Pos Pred Value                  0.9233
## Neg Pred Value                  0.9054
## Prevalence                      0.8242
## Detection Rate                  0.8129
## Detection Prevalence            0.8804
## Balanced Accuracy               0.8010

We see that our multinomial logistic model has about 91% overall accuracy, which performs a bit worse than random forest. However, the model performs very badly in predicting for S_ISOETIFO.

The model seems to predict T_HEMPRICH the best with 88.7% sensitivity and 98.8% specificity. The model also does not perform well for sensitivity of C_SERRULAT, with only about 41.7% sensitivity.

Again, we can assess our results visually:

# true values
plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers")
# predicted values
plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~predicted_class, type="scatter3d", mode="markers")

For sediment type:

test_set %>% 
    ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")

test_set %>% 
    ggplot(aes(SEDIMENT, fill=predicted_class)) + geom_bar(width=.5, position = "dodge")

For seabed type:

test_set %>% 
    ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")

test_set %>% 
    ggplot(aes(TIDAL, fill=predicted_class)) + geom_bar(width=.5, position = "dodge")

We can confirm in our visualizations that the logistic regression model failed to predict for S_ISOETIFO.

So, we see that the overall accuracy for multinomial logistic regression model vs random forest model was 95.6% and 90.9%, respectively. However, the overall accuracy stated for the logistic regression model is deceiving since it did not perform well in sensitivity in 2 out of 4 classes.